Assignment 1

Author

Sabrina (Hsi-Hsuan) Yang

library(data.table)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between()     masks data.table::between()
✖ dplyr::filter()      masks stats::filter()
✖ dplyr::first()       masks data.table::first()
✖ lubridate::hour()    masks data.table::hour()
✖ lubridate::isoweek() masks data.table::isoweek()
✖ dplyr::lag()         masks stats::lag()
✖ dplyr::last()        masks data.table::last()
✖ lubridate::mday()    masks data.table::mday()
✖ lubridate::minute()  masks data.table::minute()
✖ lubridate::month()   masks data.table::month()
✖ lubridate::quarter() masks data.table::quarter()
✖ lubridate::second()  masks data.table::second()
✖ purrr::transpose()   masks data.table::transpose()
✖ lubridate::wday()    masks data.table::wday()
✖ lubridate::week()    masks data.table::week()
✖ lubridate::yday()    masks data.table::yday()
✖ lubridate::year()    masks data.table::year()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the 2002 and 2022 PM2.5 extracts as data.tables.
# NOTE(review): both paths are absolute and machine-specific, so the script
# only runs where these files exist at exactly these locations.
data_2002 <- data.table::fread("/Users/sabrinayang/Downloads/data_2002.csv")
data_2022 <- data.table::fread("/Users/sabrinayang/Downloads/data_2022.csv")
#Check dimensions
dim(data_2002)
[1] 15976    20
dim(data_2022)
[1] 57761    20
# Check the first few rows (headers) for each dataset
head(data_2002)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 01/05/2002    AQS 60010007   1                           25.1 ug/m3 LC
2: 01/06/2002    AQS 60010007   1                           31.6 ug/m3 LC
3: 01/08/2002    AQS 60010007   1                           21.4 ug/m3 LC
4: 01/11/2002    AQS 60010007   1                           25.9 ug/m3 LC
5: 01/14/2002    AQS 60010007   1                           34.5 ug/m3 LC
6: 01/17/2002    AQS 60010007   1                           41.0 ug/m3 LC
   DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              78 Livermore               1              100
2:              92 Livermore               1              100
3:              71 Livermore               1              100
4:              80 Livermore               1              100
5:              98 Livermore               1              100
6:             115 Livermore               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     41860
2:              88101 PM2.5 - Local Conditions     41860
3:              88101 PM2.5 - Local Conditions     41860
4:              88101 PM2.5 - Local Conditions     41860
5:              88101 PM2.5 - Local Conditions     41860
6:              88101 PM2.5 - Local Conditions     41860
                           CBSA_NAME STATE_CODE      STATE COUNTY_CODE  COUNTY
1: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
2: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
3: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
4: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
5: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
6: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
   SITE_LATITUDE SITE_LONGITUDE
1:      37.68753      -121.7842
2:      37.68753      -121.7842
3:      37.68753      -121.7842
4:      37.68753      -121.7842
5:      37.68753      -121.7842
6:      37.68753      -121.7842
head(data_2022)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 01/01/2022    AQS 60010007   3                           12.7 ug/m3 LC
2: 01/02/2022    AQS 60010007   3                           13.9 ug/m3 LC
3: 01/03/2022    AQS 60010007   3                            7.1 ug/m3 LC
4: 01/04/2022    AQS 60010007   3                            3.7 ug/m3 LC
5: 01/05/2022    AQS 60010007   3                            4.2 ug/m3 LC
6: 01/06/2022    AQS 60010007   3                            3.8 ug/m3 LC
   DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              52 Livermore               1              100
2:              55 Livermore               1              100
3:              30 Livermore               1              100
4:              15 Livermore               1              100
5:              18 Livermore               1              100
6:              16 Livermore               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     41860
2:              88101 PM2.5 - Local Conditions     41860
3:              88101 PM2.5 - Local Conditions     41860
4:              88101 PM2.5 - Local Conditions     41860
5:              88101 PM2.5 - Local Conditions     41860
6:              88101 PM2.5 - Local Conditions     41860
                           CBSA_NAME STATE_CODE      STATE COUNTY_CODE  COUNTY
1: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
2: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
3: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
4: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
5: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
6: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
   SITE_LATITUDE SITE_LONGITUDE
1:      37.68753      -121.7842
2:      37.68753      -121.7842
3:      37.68753      -121.7842
4:      37.68753      -121.7842
5:      37.68753      -121.7842
6:      37.68753      -121.7842
# Check the last few rows (footers) for each dataset
tail(data_2002)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 12/10/2002    AQS 61131003   1                             15 ug/m3 LC
2: 12/13/2002    AQS 61131003   1                             15 ug/m3 LC
3: 12/22/2002    AQS 61131003   1                              1 ug/m3 LC
4: 12/25/2002    AQS 61131003   1                             23 ug/m3 LC
5: 12/28/2002    AQS 61131003   1                              5 ug/m3 LC
6: 12/31/2002    AQS 61131003   1                              6 ug/m3 LC
   DAILY_AQI_VALUE            Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              57 Woodland-Gibson Road               1              100
2:              57 Woodland-Gibson Road               1              100
3:               4 Woodland-Gibson Road               1              100
4:              74 Woodland-Gibson Road               1              100
5:              21 Woodland-Gibson Road               1              100
6:              25 Woodland-Gibson Road               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     40900
2:              88101 PM2.5 - Local Conditions     40900
3:              88101 PM2.5 - Local Conditions     40900
4:              88101 PM2.5 - Local Conditions     40900
5:              88101 PM2.5 - Local Conditions     40900
6:              88101 PM2.5 - Local Conditions     40900
                                 CBSA_NAME STATE_CODE      STATE COUNTY_CODE
1: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
2: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
3: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
4: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
5: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
6: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
   COUNTY SITE_LATITUDE SITE_LONGITUDE
1:   Yolo      38.66121      -121.7327
2:   Yolo      38.66121      -121.7327
3:   Yolo      38.66121      -121.7327
4:   Yolo      38.66121      -121.7327
5:   Yolo      38.66121      -121.7327
6:   Yolo      38.66121      -121.7327
tail(data_2022)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 12/01/2022    AQS 61131003   1                            3.4 ug/m3 LC
2: 12/07/2022    AQS 61131003   1                            3.8 ug/m3 LC
3: 12/13/2022    AQS 61131003   1                            6.0 ug/m3 LC
4: 12/19/2022    AQS 61131003   1                           34.8 ug/m3 LC
5: 12/25/2022    AQS 61131003   1                           23.2 ug/m3 LC
6: 12/31/2022    AQS 61131003   1                            1.0 ug/m3 LC
   DAILY_AQI_VALUE            Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              14 Woodland-Gibson Road               1              100
2:              16 Woodland-Gibson Road               1              100
3:              25 Woodland-Gibson Road               1              100
4:              99 Woodland-Gibson Road               1              100
5:              74 Woodland-Gibson Road               1              100
6:               4 Woodland-Gibson Road               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     40900
2:              88101 PM2.5 - Local Conditions     40900
3:              88101 PM2.5 - Local Conditions     40900
4:              88101 PM2.5 - Local Conditions     40900
5:              88101 PM2.5 - Local Conditions     40900
6:              88101 PM2.5 - Local Conditions     40900
                                 CBSA_NAME STATE_CODE      STATE COUNTY_CODE
1: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
2: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
3: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
4: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
5: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
6: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
   COUNTY SITE_LATITUDE SITE_LONGITUDE
1:   Yolo      38.66121      -121.7327
2:   Yolo      38.66121      -121.7327
3:   Yolo      38.66121      -121.7327
4:   Yolo      38.66121      -121.7327
5:   Yolo      38.66121      -121.7327
6:   Yolo      38.66121      -121.7327
# Check variable names and types
str(data_2002)
Classes 'data.table' and 'data.frame':  15976 obs. of  20 variables:
 $ Date                          : chr  "01/05/2002" "01/06/2002" "01/08/2002" "01/11/2002" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Daily Mean PM2.5 Concentration: num  25.1 31.6 21.4 25.9 34.5 41 29.3 15 18.8 37.9 ...
 $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ DAILY_AQI_VALUE               : int  78 92 71 80 98 115 87 57 65 107 ...
 $ Site Name                     : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS_PARAMETER_CODE            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ CBSA_CODE                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA_NAME                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
 $ STATE                         : chr  "California" "California" "California" "California" ...
 $ COUNTY_CODE                   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ COUNTY                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ SITE_LATITUDE                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ SITE_LONGITUDE                : num  -122 -122 -122 -122 -122 ...
 - attr(*, ".internal.selfref")=<externalptr> 
str(data_2022)
Classes 'data.table' and 'data.frame':  57761 obs. of  20 variables:
 $ Date                          : chr  "01/01/2022" "01/02/2022" "01/03/2022" "01/04/2022" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Daily Mean PM2.5 Concentration: num  12.7 13.9 7.1 3.7 4.2 3.8 2.3 6.9 13.6 11.2 ...
 $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ DAILY_AQI_VALUE               : int  52 55 30 15 18 16 10 29 54 47 ...
 $ Site Name                     : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS_PARAMETER_CODE            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ CBSA_CODE                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA_NAME                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
 $ STATE                         : chr  "California" "California" "California" "California" ...
 $ COUNTY_CODE                   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ COUNTY                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ SITE_LATITUDE                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ SITE_LONGITUDE                : num  -122 -122 -122 -122 -122 ...
 - attr(*, ".internal.selfref")=<externalptr> 
# Check the 2002 PM2.5 column for missing values. The column name contains
# spaces, so it is looked up by its exact name with [[ ]]. A `$` lookup with a
# name that matches no column returns NULL silently, which makes
# any(is.na(...)) report FALSE no matter what the data contain.
any(is.na(data_2002[["Daily Mean PM2.5 Concentration"]]))
[1] FALSE
# Check the 2022 PM2.5 column for missing values. The column is looked up by
# its exact name (it contains spaces); a `$` lookup with a non-matching name
# returns NULL and would make this check always FALSE.
any(is.na(data_2022[["Daily Mean PM2.5 Concentration"]]))
[1] FALSE

There are no missing values of daily mean PM2.5 concentration in either data set.

library(dplyr)
# Tag every row with the year of its source file so the two extracts can
# still be told apart once they are stacked.
data_2002 <- mutate(data_2002, year = 2002)
data_2022 <- mutate(data_2022, year = 2022)

# Stack the two years into one table and inspect the resulting structure.
combined_20022022 <- bind_rows(list(data_2002, data_2022))
str(combined_20022022)
Classes 'data.table' and 'data.frame':  73737 obs. of  21 variables:
 $ Date                          : chr  "01/05/2002" "01/06/2002" "01/08/2002" "01/11/2002" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Daily Mean PM2.5 Concentration: num  25.1 31.6 21.4 25.9 34.5 41 29.3 15 18.8 37.9 ...
 $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ DAILY_AQI_VALUE               : int  78 92 71 80 98 115 87 57 65 107 ...
 $ Site Name                     : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS_PARAMETER_CODE            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ CBSA_CODE                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA_NAME                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
 $ STATE                         : chr  "California" "California" "California" "California" ...
 $ COUNTY_CODE                   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ COUNTY                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ SITE_LATITUDE                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ SITE_LONGITUDE                : num  -122 -122 -122 -122 -122 ...
 $ year                          : num  2002 2002 2002 2002 2002 ...
 - attr(*, ".internal.selfref")=<externalptr> 
library(leaflet)
# Map each year to a colour; the same palette drives the markers and legend.
color_palette <- colorFactor(palette = "Set3", domain = unique(combined_20022022$year))
# Create the map
# Site coordinates repeat on every daily row, so the data are reduced to one
# row per (year, latitude, longitude) before plotting. This draws one marker
# per monitoring location and year instead of one per daily observation.
combined_20022022 %>%
  distinct(year, SITE_LATITUDE, SITE_LONGITUDE) %>%
  leaflet() %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~SITE_LONGITUDE,
    lat = ~SITE_LATITUDE,
    radius = 2,
    color = ~color_palette(year),
    popup = ~paste("Year:", year),
    label = ~as.character(year)
  ) %>%
  addLegend(
    "bottomright",
    pal = color_palette,
    values = ~year,
    title = "Year",
    opacity = 1
  )

The map shows where the data were collected in 2002 and 2022 within the state of California.

# Count the missing PM2.5 values in the combined data and express them as a
# share of all rows. The column is looked up by its exact name (it contains
# spaces); a `$` lookup with a non-matching name returns NULL, which would
# make the count trivially 0.
missing_pm25 <- sum(is.na(combined_20022022[["Daily Mean PM2.5 Concentration"]]))
prop_missing_pm25 <- missing_pm25 / nrow(combined_20022022)
print("Missing Values in PM2.5:")
[1] "Missing Values in PM2.5:"
print(missing_pm25)
[1] 0
print("Proportion of Missing Values:")
[1] "Proportion of Missing Values:"
print(prop_missing_pm25)
[1] 0
# Five-number summary plus mean of the daily PM2.5 values. The column must be
# addressed by its exact name: summarising a non-existent column summarises
# NULL and yields only "Length 0 / Class NULL / Mode NULL".
summary_pm25 <- summary(combined_20022022[["Daily Mean PM2.5 Concentration"]])
print("Summary Statistics for PM2.5:")
[1] "Summary Statistics for PM2.5:"
print(summary_pm25)
Length  Class   Mode 
     0   NULL   NULL 
library(ggplot2)
library(dplyr)
#on a state level
# Give the concentration column a short, syntactic name for plotting.
combined_20022022 <- rename(
  combined_20022022,
  PM2.5 = `Daily Mean PM2.5 Concentration`
)

# Line through the per-year mean PM2.5 across all observations.
ggplot(data = combined_20022022, mapping = aes(x = year, y = PM2.5)) +
  stat_summary(geom = "line", fun = "mean") +
  labs(
    x = "Year",
    y = "PM2.5 Concentration",
    title = "PM2.5 Concentration in California by Year"
  )

# Mean PM2.5 per year over all rows of the combined data (one row per year).
summary_state <- aggregate(PM2.5 ~ year, data = combined_20022022, FUN = mean)
print(summary_state)
  year     PM2.5
1 2002 16.115943
2 2022  8.564708

The line graph indicates that the mean PM2.5 concentration decreased between 2002 and 2022. The summary statistics agree: the mean PM2.5 concentration was 16.115943 in 2002 and 8.564708 in 2022, which is a decrease.

#on a county level 
#boxplot
# Use a lower-case column name for the county grouping variable.
combined_20022022 <- combined_20022022 %>% rename(county = `COUNTY`)

# Distribution of PM2.5 values per county. Every layer, including labs(),
# must be chained with `+`; a labs() call that is not joined to the plot is
# evaluated on its own and the title and axis labels never reach the figure.
ggplot(combined_20022022, aes(x = county, y = PM2.5)) +
  geom_boxplot() +
  # Rotate the county names so they do not overlap on the x axis.
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "PM2.5 Distribution by County",
       x = "County",
       y = "Mean PM2.5 Concentration")
$x
[1] "County"

$y
[1] "Mean PM2.5 Concentration"

$title
[1] "PM2.5 Distribution by County"

attr(,"class")
[1] "labels"
#summary 
# Per-county mean, median and standard deviation of PM2.5.
by_county <- group_by(combined_20022022, county)
summary_county <- summarise(
  by_county,
  mean_PM2.5 = mean(PM2.5),
  median_PM2.5 = median(PM2.5),
  sd_PM2.5 = sd(PM2.5)
)
print(summary_county)
# A tibble: 51 × 4
   county       mean_PM2.5 median_PM2.5 sd_PM2.5
   <chr>             <dbl>        <dbl>    <dbl>
 1 Alameda            8.81         7.2      6.21
 2 Butte              8.71         6        8.90
 3 Calaveras          6.60         5.3      4.71
 4 Colusa             8.40         7        6.32
 5 Contra Costa       9.95         7.8      8.92
 6 Del Norte          4.75         4.05     3.43
 7 El Dorado          4.47         3.1      7.21
 8 Fresno            12.3          8.4     12.1 
 9 Glenn              5.34         4.4      4.98
10 Humboldt           7.11         6        4.45
# ℹ 41 more rows

Based on the summary, the lowest overall mean PM2.5 is in El Dorado County (4.471330) and the highest is in Kern County (15.594534). According to the box plot, the two most extreme PM2.5 outliers are in Placer and Siskiyou counties, both at around 300.

#for sites in LA
library(data.table)
library(tidyverse)
# Load the 2002 and 2022 Los Angeles extracts as data.tables.
# NOTE(review): both paths are absolute and machine-specific, so the script
# only runs where these files exist at exactly these locations.
la_2002 <- data.table::fread("/Users/sabrinayang/Downloads/la_2002.csv")
la_2022 <- data.table::fread("/Users/sabrinayang/Downloads/la_2022.csv")
#Check dimensions
dim(la_2002)
[1] 2349   20
dim(la_2022)
[1] 6016   20
#Check the first few rows (headers) for each dataset
head(la_2002)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 01/01/2002    AQS 60370002   1                           32.3 ug/m3 LC
2: 01/02/2002    AQS 60370002   1                           57.2 ug/m3 LC
3: 01/03/2002    AQS 60370002   1                           39.2 ug/m3 LC
4: 01/04/2002    AQS 60370002   1                           23.2 ug/m3 LC
5: 01/05/2002    AQS 60370002   1                            7.3 ug/m3 LC
6: 01/07/2002    AQS 60370002   1                            7.3 ug/m3 LC
   DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              93     Azusa               1              100
2:             152     Azusa               1              100
3:             110     Azusa               1              100
4:              74     Azusa               1              100
5:              30     Azusa               1              100
6:              30     Azusa               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     31080
2:              88101 PM2.5 - Local Conditions     31080
3:              88101 PM2.5 - Local Conditions     31080
4:              88101 PM2.5 - Local Conditions     31080
5:              88101 PM2.5 - Local Conditions     31080
6:              88101 PM2.5 - Local Conditions     31080
                            CBSA_NAME STATE_CODE      STATE COUNTY_CODE
1: Los Angeles-Long Beach-Anaheim, CA          6 California          37
2: Los Angeles-Long Beach-Anaheim, CA          6 California          37
3: Los Angeles-Long Beach-Anaheim, CA          6 California          37
4: Los Angeles-Long Beach-Anaheim, CA          6 California          37
5: Los Angeles-Long Beach-Anaheim, CA          6 California          37
6: Los Angeles-Long Beach-Anaheim, CA          6 California          37
        COUNTY SITE_LATITUDE SITE_LONGITUDE
1: Los Angeles       34.1365      -117.9239
2: Los Angeles       34.1365      -117.9239
3: Los Angeles       34.1365      -117.9239
4: Los Angeles       34.1365      -117.9239
5: Los Angeles       34.1365      -117.9239
6: Los Angeles       34.1365      -117.9239
head(la_2022)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 01/05/2022    AQS 60370002   1                           10.7 ug/m3 LC
2: 01/11/2022    AQS 60370002   1                            3.1 ug/m3 LC
3: 01/17/2022    AQS 60370002   1                           11.9 ug/m3 LC
4: 01/23/2022    AQS 60370002   1                            3.5 ug/m3 LC
5: 01/26/2022    AQS 60370002   1                            3.4 ug/m3 LC
6: 01/29/2022    AQS 60370002   1                            4.3 ug/m3 LC
   DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              45     Azusa               1              100
2:              13     Azusa               1              100
3:              50     Azusa               1              100
4:              15     Azusa               1              100
5:              14     Azusa               1              100
6:              18     Azusa               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     31080
2:              88101 PM2.5 - Local Conditions     31080
3:              88101 PM2.5 - Local Conditions     31080
4:              88101 PM2.5 - Local Conditions     31080
5:              88101 PM2.5 - Local Conditions     31080
6:              88101 PM2.5 - Local Conditions     31080
                            CBSA_NAME STATE_CODE      STATE COUNTY_CODE
1: Los Angeles-Long Beach-Anaheim, CA          6 California          37
2: Los Angeles-Long Beach-Anaheim, CA          6 California          37
3: Los Angeles-Long Beach-Anaheim, CA          6 California          37
4: Los Angeles-Long Beach-Anaheim, CA          6 California          37
5: Los Angeles-Long Beach-Anaheim, CA          6 California          37
6: Los Angeles-Long Beach-Anaheim, CA          6 California          37
        COUNTY SITE_LATITUDE SITE_LONGITUDE
1: Los Angeles       34.1365      -117.9239
2: Los Angeles       34.1365      -117.9239
3: Los Angeles       34.1365      -117.9239
4: Los Angeles       34.1365      -117.9239
5: Los Angeles       34.1365      -117.9239
6: Los Angeles       34.1365      -117.9239
# Check the 2002 LA PM2.5 column for missing values. The column is looked up
# by its exact name (it contains spaces); a `$` lookup with a non-matching
# name returns NULL and would make this check always FALSE.
any(is.na(la_2002[["Daily Mean PM2.5 Concentration"]]))
[1] FALSE
# Check the 2022 LA PM2.5 column for missing values. The column is looked up
# by its exact name (it contains spaces); a `$` lookup with a non-matching
# name returns NULL and would make this check always FALSE.
any(is.na(la_2022[["Daily Mean PM2.5 Concentration"]]))
[1] FALSE
#Combine datasets
library(dplyr)
# Tag every row with the year of its source file, then stack both years.
la_2002 <- mutate(la_2002, year = 2002)
la_2022 <- mutate(la_2022, year = 2022)

combinedla_20022022 <- bind_rows(list(la_2002, la_2022))
str(combinedla_20022022)
Classes 'data.table' and 'data.frame':  8365 obs. of  21 variables:
 $ Date                          : chr  "01/01/2002" "01/02/2002" "01/03/2002" "01/04/2002" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site ID                       : int  60370002 60370002 60370002 60370002 60370002 60370002 60370002 60370002 60370002 60370002 ...
 $ POC                           : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Daily Mean PM2.5 Concentration: num  32.3 57.2 39.2 23.2 7.3 7.3 17.2 24.9 21.7 11.2 ...
 $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ DAILY_AQI_VALUE               : int  93 152 110 74 30 30 62 78 71 47 ...
 $ Site Name                     : chr  "Azusa" "Azusa" "Azusa" "Azusa" ...
 $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS_PARAMETER_CODE            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ CBSA_CODE                     : int  31080 31080 31080 31080 31080 31080 31080 31080 31080 31080 ...
 $ CBSA_NAME                     : chr  "Los Angeles-Long Beach-Anaheim, CA" "Los Angeles-Long Beach-Anaheim, CA" "Los Angeles-Long Beach-Anaheim, CA" "Los Angeles-Long Beach-Anaheim, CA" ...
 $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
 $ STATE                         : chr  "California" "California" "California" "California" ...
 $ COUNTY_CODE                   : int  37 37 37 37 37 37 37 37 37 37 ...
 $ COUNTY                        : chr  "Los Angeles" "Los Angeles" "Los Angeles" "Los Angeles" ...
 $ SITE_LATITUDE                 : num  34.1 34.1 34.1 34.1 34.1 ...
 $ SITE_LONGITUDE                : num  -118 -118 -118 -118 -118 ...
 $ year                          : num  2002 2002 2002 2002 2002 ...
 - attr(*, ".internal.selfref")=<externalptr> 
#create map
library(leaflet)
# Map each year to a colour; the same palette drives the markers and legend.
color_combinedla <- colorFactor(palette = "Set1", domain = unique(combinedla_20022022$year))
# Create the map
# Site coordinates repeat on every daily row, so the data are reduced to one
# row per (year, latitude, longitude) before plotting. This draws one marker
# per monitoring location and year instead of one per daily observation.
combinedla_20022022 %>%
  distinct(year, SITE_LATITUDE, SITE_LONGITUDE) %>%
  leaflet() %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~SITE_LONGITUDE,
    lat = ~SITE_LATITUDE,
    radius = 2,
    color = ~color_combinedla(year),
    popup = ~paste("Year:", year),
    label = ~as.character(year)
  ) %>%
  addLegend(
    "bottomright",
    pal = color_combinedla,
    values = ~year,
    title = "Year",
    opacity = 1
  )

The map shows where the data were collected in 2002 and 2022 in the greater Los Angeles area.

#change variable names
# Give the site and concentration columns short, syntactic names.
combinedla_20022022 <- combinedla_20022022 %>% rename(site = `Site Name`)
combinedla_20022022 <- combinedla_20022022 %>% rename(PM2.5 = `Daily Mean PM2.5 Concentration`)
#create boxplot
# Every layer, including labs(), must be chained with `+`; a labs() call that
# is not joined to the plot is evaluated on its own and the title and axis
# labels never reach the figure.
ggplot(combinedla_20022022, aes(x = site, y = PM2.5)) +
  geom_boxplot() +
  # Rotate the site names so they do not overlap on the x axis.
  theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs(title = "PM2.5 Distribution by site in LA",
       x = "Site",
       y = "PM2.5 Concentration")
$x
[1] "Site"

$y
[1] "PM2.5 Concentration"

$title
[1] "PM2.5 Distribution by site in LA"

attr(,"class")
[1] "labels"
#create histogram
# One histogram panel per site, with PM2.5 binned in steps of 5.
ggplot(data = combinedla_20022022, mapping = aes(x = PM2.5)) +
  geom_histogram(binwidth = 5) +
  facet_wrap(vars(site)) +
  labs(
    x = "PM2.5 Concentration",
    y = "Frequency",
    title = "PM2.5 Distribution by site"
  )

#summary
# Per-site mean, median and standard deviation of PM2.5.
# Rows whose site name is an empty string form their own group.
by_site <- group_by(combinedla_20022022, site)
summary_site <- summarise(
  by_site,
  mean_PM2.5 = mean(PM2.5),
  median_PM2.5 = median(PM2.5),
  sd_PM2.5 = sd(PM2.5)
)
print(summary_site)
# A tibble: 20 × 4
   site                             mean_PM2.5 median_PM2.5 sd_PM2.5
   <chr>                                 <dbl>        <dbl>    <dbl>
 1 ""                                    23.9          21.5    12.3 
 2 "Anaheim"                             12.3          10.2     8.32
 3 "Azusa"                               18.7          16.7    11.9 
 4 "Burbank"                             24.0          21.7    12.7 
 5 "Compton"                             13.0          11.9     6.22
 6 "Glendora"                             8.42          7.8     5.47
 7 "Lancaster-Division Street"            8.14          7.7     3.33
 8 "Lebec"                                4.44          4.2     2.60
 9 "Long Beach (North)"                  18.2          15.5    10.3 
10 "Long Beach (South)"                  12.0          11.4     4.29
11 "Long Beach-Route 710 Near Road"      13.4          12.5     5.67
12 "Los Angeles-North Main Street"       14.6          12.4     8.72
13 "Lynwood"                             23.3          19.8    12.0 
14 "Mission Viejo"                       12.0           9.8     7.96
15 "North Hollywood (NOHO)"              13.0          13.1     4.75
16 "Pasadena"                            14.7          11.8    10.0 
17 "Pico Rivera #2"                      11.4          10.0     5.93
18 "Reseda"                              12.3          11       7.05
19 "Santa Clarita"                        9.14          8.5     3.90
20 "Signal Hill (LBSH)"                   8.85          8.5     4.42

Based on the summary table, Lebec has the lowest mean PM2.5 (4.439333), while Burbank has the highest mean PM2.5 (23.969672). The boxplot shows the distribution of PM2.5 (including outliers) at each site in LA. The histogram shows the frequency of PM2.5 concentrations at each site.